library(data.table); library(magrittr)
library(ggplot2); theme_set(theme_bw(base_size = 14))
library(patchwork)
library(SingleCellExperiment)
library(pcaExplorer); library(org.Mm.eg.db)
source("src/marker_gene_extract_and_plot.R")
source("src/load_data_from_box.R")
library(pheatmap)
library(org.Mm.eg.db); library(clusterProfiler)
Arm) and chronic (Cl13) infection
For details of how we retrieved the public data sets, see Schauder2021.Rmd and Yao2019.Rmd.
## Code here does not need to be executed, it's simply for documentation purposes
## to show the details of how we generated the composite integrated file that
## we're providing for download
library(magrittr)
library(batchelor)
library(scran)
library(scater)
library(patchwork)
fnall <- "sce_integrated_with_YaoProgsMpecs-and-Schauder.rds"
fnmerged <- "sceMultiout_integrated_with_YaoProgsMpecs-and-Schauder.rds"
## MAKE A LIST of INDIVIDUAL SCEs ==============================================
## our data -------------
sln <- load_data_from_Box("https://wcm.box.com/shared/static/2wvbdfs3vja2cnlckcqpk20eu1693o8o.rds")
rownames(sln) <- rowData(sln)$ID
scel <- lapply(unique(sln$Sample), function(x){
out.sce <- sln[, sln$Sample == x]
assay(out.sce, "logcounts") <- NULL
colData(out.sce) <- colData(out.sce)[, c("Sample","Barcode","cell")]
rownames(out.sce) <- rowData(out.sce)$ID
return(out.sce)
})
names(scel) <- unique(sln$Sample)
## Schauder -------------
scs <- readRDS("../2021-08_Schauder2021/sce_integrated_with_Schauder2021.rds")
assay(scs, "logcounts") <- NULL
scs <- scs[,!grepl("^pLN",scs$Sample)]
colData(scs) <- colData(scs)[, c("Sample","Barcode","cell")]
rownames(scs) <- rowData(scs)$ID
scel$Schauder <- scs
## Yao -------------
scy <- readRDS("../2021-08_Yao2019/sce_integrated_with_Yao2019.rds")
assay(scy, "logcounts") <- NULL
scy <- scy[,!grepl("^pLN",scy$Sample)]
yaolabs <- read.table("../2021-08_Yao2019/Yao2019_metadata.txt", header = FALSE, skip = 1)
names(yaolabs) <- c("YaoCell","nGene","nUMI","orig.ident","percent.mito","res.0.5","res.1","res.1.5")
## res.0.5 = clusters; 10 = progs, 7 = MPecs
kp_cells <- subset(yaolabs, res.0.5 %in% c(10, 7)) %>% .$YaoCell
scy$YaoCell <- paste(gsub("_[12]$", "", scy$Sample), gsub("-[0-9]$","", scy$Barcode), sep = "_")
for(i in unique(scy$Sample)){
out.sce <- scy[, scy$Sample == i]
out.sce <- out.sce[, out.sce$YaoCell %in% kp_cells]
colData(out.sce) <- colData(out.sce)[, c("Sample","Barcode","cell")]
rownames(out.sce) <- rowData(out.sce)$ID
scel[[i]] <- out.sce}
## PREPPING ============================================================
rd_qc <- lapply(scel,perFeatureQCMetrics)
for(x in seq_along(scel)){rowData(scel[[x]])$qc.mean <- rd_qc[[x]]$mean}
scel2 <- lapply(scel, function(x){ x[rowData(x)$qc.mean > 0.001,]})
## combine
universe <- Reduce(intersect, lapply(scel2, rownames))
scel2 <- lapply(scel2, "[", i=universe)
# generate logcounts
normed.sce <- do.call(multiBatchNorm, scel2) # returns a list
# Identifying a set of HVGs using stats from all batches, using logcounts
all.dec <- lapply(normed.sce, modelGeneVar)
combined.dec <- do.call(combineVar, all.dec)
combined.hvg <- getTopHVGs(combined.dec, n=2000)
## MERGING with MNN ====================================================
## prep
combined <- noCorrect(normed.sce)
assayNames(combined) <- "logcounts"
combined$Sample <- combined$batch
combined$batch <- NULL
set.seed(1010100)
## progressively merge cells from each sample in each batch until all cells
## are mapped onto a common coordinate space
multiout <- fastMNN(combined, batch=combined$Sample, subset.row=combined.hvg)
# Renaming metadata fields for easier communication later.
multiout$Sample <- multiout$batch
## UMAP----------------------------------
set.seed(10101010)
multiout <- runUMAP(multiout, dimred="corrected")
## Clustering -----------------------------
g <- buildSNNGraph(multiout, use.dimred="corrected", k = 20)
clusters <- igraph::cluster_louvain(g)
multiout$cluster_with_SchauderYao_k20 <- factor(clusters$membership)
g <- buildSNNGraph(multiout, use.dimred="corrected", k = 50)
clusters <- igraph::cluster_louvain(g)
multiout$cluster_with_SchauderYao_k50 <- factor(clusters$membership)
g <- buildSNNGraph(multiout, use.dimred="corrected", k = 10)
clusters <- igraph::cluster_louvain(g)
multiout$cluster_with_SchauderYao_k10 <- factor(clusters$membership)
g <- buildSNNGraph(multiout, use.dimred="corrected", k = 5)
clusters <- igraph::cluster_louvain(g)
multiout$cluster_with_SchauderYao_k5 <- factor(clusters$membership)
saveRDS(multiout,file = fnmerged)
# generate composite file for the combined file of all shared genes ============
## combine
universe <- Reduce(intersect, lapply(scel, rownames))
scel <- lapply(scel, "[", i=universe)
comb.mat <- lapply(scel, function(x) counts(x)) %>% do.call(cbind, .)
colnames(comb.mat) <- unlist(lapply(scel, function(x) colnames(x)))
### rowData
rd <- rowData(scel[[1]])[, c("ID","Symbol")]
rd <- rd[rownames(comb.mat),]
## colData
cd <- lapply(scel, function(x) colData(x)[, c("Sample","Barcode","cell")]) %>% do.call(rbind, .)
cd <- cd[colnames(comb.mat),]
scAll <- SingleCellExperiment(
assays = list(counts = comb.mat),
colData = cd, rowData = rd)
## add redDims from the merged data set
rdu <- reducedDim(multiout, "UMAP")
reducedDim(scAll, "UMAP") <- rdu[colnames(scAll),]
reducedDim(scAll, "PCA_corr") <- reducedDim(multiout, "corrected")
## add log-counts
qckclst <- quickCluster(scAll, method = "igraph", min.mean = 0.1)
scAll <- computeSumFactors(scAll, min.mean=0.1, cluster = qckclst)
scAll <- scater::logNormCounts(scAll)
saveRDS(scAll, file = fnall)
colData(multiout) %>% saveRDS("colData_multiout_YaoProgsMpecs-and-Schauder.rds")
## fix ColData ======================================================
cd2 <- colData(multiout)
cd2 <- as.data.frame(cd2)
cd2$cell <- rownames(cd2)
cd2 <- cd2[, c("cell",grep("cluster", names(cd2), value=TRUE))]
newcd <- merge(colData(scAll), cd2, by = "cell")
## add YAO LABELS -----------------------------------------------
newcd$YaoCell <- paste(gsub("_[12]$", "", newcd$Sample), gsub("-[0-9]$","", newcd$Barcode), sep = "_")
newcd <- merge(newcd, yaolabs, by="YaoCell", all.x = TRUE)
## add OUR LABELS ------------------------------------------------
cdus <- colData(sln)
newcd <- cdus[, c("cell","CD62L","label")] %>% as.data.frame %>%
merge(newcd, ., on = "cell", all.x = TRUE)
newcd$label <- ifelse(!is.na(newcd$label), newcd$label,
ifelse(grepl("GSM3732587", newcd$Sample), "Schauder",
ifelse(grepl("^D", newcd$Sample), paste0("Yao_", gsub("_1$", "", gsub("_P14","",newcd$Sample))),
"no.clue")))
## 10 = progs, 7 = MPecs
newcd$label2 <- ifelse(is.na(newcd$res.0.5), newcd$label,
ifelse(newcd$res.0.5 == 10, "ProgLike",
ifelse(newcd$res.0.5 == 7, paste(newcd$label, "MPECS", sep = "_"), "no.clue")))
newcd$label2 <- ifelse(newcd$label2 == "Schauder", "Tcm", newcd$label2)
#> table(newcd$label2)
# AIC AMC ProgLike Tcm
# 1704 1335 114 535
# transition Yao_D7_Arm_MPECS Yao_D7_Cl13_MPECS
# 2016 386 2
## finalize colData
rownames(newcd) <- newcd$cell
colData(scAll) <- newcd[colnames(scAll),]
## adjust the naming scheme for our populations
scAll$label <- ifelse(scAll$label == "AIC","AP", ifelse(scAll$label == "transition", "intermediate", ifelse(scAll$label == "AMC", "AM", scAll$label)))
scAll$label2 <- ifelse(scAll$label2 == "AIC","AP", ifelse(scAll$label2 == "transition", "intermediate", ifelse(scAll$label2 == "AMC", "AM", scAll$label2)))
## remove the lonely two cells classified as D7-Cl13-MPECS
scAll <- scAll[, scAll$label2 != "Yao_D7_Cl13_MPECS"]
scAll$label2 <- gsub("Yao_D7_Arm_", "", scAll$label2)
scAll$label2 <- factor(scAll$label2, levels = c("AP","intermediate","AM","MPECS","ProgLike", "Tcm"), ordered = TRUE)
scAll$Ref <- ifelse(grepl("Yao",scAll$label), "Yao", ifelse(grepl("Schauder", scAll$label), "Schauder", "Gearty"))
rownames(scAll) <- scater::uniquifyFeatureNames(ID = rowData(scAll)$ID,
names = rowData(scAll)$Symbol)
saveRDS(scAll, file = fnall)
sc3 <- load_RDSdata_from_Box(shared_link = "https://wcm.box.com/shared/static/ixtkzzow5b4r2ck3u6zit2jtep7dunzv.rds")
## Cached data here: ~/Library/Caches/BiocFileCache/d3a0698a2fae_ixtkzzow5b4r2ck3u6zit2jtep7dunzv.rds
## define color schemes ---------------------------------------------------------
popcols <- c("#1E498F", "#629DD4","#5E3DD8", # dark blue, light blue, intermediate blue
"#228B22", #"bisque1",
"#9ED9B9",#"lightpink1",
"maroon1")
names(popcols) <- c("AP","intermediate","AM",
"Tcm",
"MPECS","ProgLike")
sample_cols <- c(
"D7_P14_Cl13_1" = "sienna4", "D7_P14_Arm_1"="sienna2",
"GSM3732587_Day129"="wheat3",
"pLN_1"="blue","pLN_2"="dodgerblue1","pLN_3"="lightskyblue")
cluster_cols <- c('#B1E2F9','limegreen','grey30','#FFC914','#0066E2','#FC71E9','#00E5CA','#E31A1C','grey83','#FF7F00','#378C07','#6A3D9A','#FFFF99','#B15928')
names(cluster_cols) <- sort(unique(sc3$cluster_with_SchauderYao_k20))
cluster_cols <- cluster_cols[sort(unique(sc3$cluster_with_SchauderYao_k20))]
We can pretend that the individual cell types of interest are pseudo-bulk samples, i.e. we can sum up the reads across all cells of the same label-sample combination (e.g. all cells from Sample “pLN_1” that are labelled as “AP” will form one pseudo-bulk sample “pLN_1_AP”). This helps us to apply more robust bulk-RNA-seq methods for global comparisons across the samples.
summed.scf <- scater::aggregateAcrossCells(sc3,
ids=colData(sc3)[,c("label2","Sample")])
library(edgeR)
y <- DGEList(counts(summed.scf), samples=colData(summed.scf))
keep <- filterByExpr(y, group=summed.scf$Sample)
y <- y[keep,]
y <- calcNormFactors(y)
PCA <- prcomp(t(cpm(y, log = TRUE)), scale = F)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
dataGG = data.frame(PC1 = PCA$x[,1],PC2 = PCA$x[,2],
PC3 = PCA$x[,3],PC4 = PCA$x[,4],
sampleName = row.names(y$samples),
y$samples)
dataGG$MySampleName <- paste(dataGG$Sample.1, dataGG$label2.1, sep = "_")
p1 <- ggplot(dataGG, aes(x = PC1, y = PC2, fill = label2.1, shape = Ref)) +
geom_point(size = 8, alpha = .85) +
xlab(paste0("PC1, VarExp:", round(percentVar[1],4))) +
ylab(paste0("PC2, VarExp:", round(percentVar[2],4))) +
scale_fill_manual(values = popcols, name = "") +
scale_shape_manual(values = c(21,22,23), name = "") +
theme(legend.position = "bottom")
p2 <- ggplot(dataGG, aes(x = PC1, y = PC2, color = label2.1, label = MySampleName)) +
geom_text(size = 6) + scale_color_manual(values = popcols) +
theme(legend.position = "none")
p1 | p2
par(mfrow=c(2,1))
hi_loadings(PCA,whichpc = as.integer(1),topN = 20)
hi_loadings(PCA,whichpc = as.integer(2),topN = 20)
hc <- hclust( as.dist(1-cor(cpm(y, log = TRUE))), method="complete")
hc$labels <- dataGG$MySampleName
#plot(hc)
library(dendextend)
dend <- as.dendrogram(hc)
labels_colors(dend) <- c(rep(popcols[["AP"]], 3), popcols[["AM"]], rep(popcols[["intermediate"]], 3), rep(popcols[["AM"]], 2), popcols[["Tcm"]], popcols[["MPECS"]], popcols[["ProgLike"]])
par(mar=c(15,4,4,2))
rotate(dend, c( 'pLN_1_AP', 'pLN_3_AP', 'pLN_4_AP', 'pLN_1_intermediate', 'pLN_3_intermediate', 'pLN_4_intermediate', 'pLN_1_AM', 'pLN_3_AM', 'pLN_4_AM', 'D7_P14_Arm_1_MPECS', 'D7_P14_Cl13_1_ProgLike','GSM3732587_Day129_Tcm')) %>% plot
comparing AP vs ProgLike vs Tcm and so on
Taking all genes that are either up- or down in any one of the populations (FDR <= 1%) and that are at least in the top 50 for each population.
LabelMarkers <- scran::findMarkers(sc3,
group = sc3$label2)
goi <- lapply(LabelMarkers, function(x) rownames(subset(as.data.frame(x), FDR <= 0.01 & Top <= 50))) %>% unlist %>% unique
## with Tcf7 as a color bar
plot_marker_heatmap(sc3,
gns=unique(c("Tox",goi)),
gns_as_cellAnnotation = c("Tcf7","Tox"), exclude_genes = "Tox",
exprs_values = "logcounts",
show_HKgenes = "both",
n_quant_breaks = 300,
scale="row",col_palette = c("mediumorchid","black","yellow"),
fontsize_row = 7,
add_cell_annotation = data.frame(
row.names = colnames(sc3),
#cluster = sc3$cluster_with_SchauderYao_k20,
population = sc3$label2),
define_anno_cols = list(
#cluster = cluster_cols,
population = popcols,
Tox = scales::brewer_pal("seq", "Reds")(5)[1:4],
Tcf7 = scales::brewer_pal("seq", "Reds")(5)[1:4]),
main = "Marker genes of the individual populations\n(FDR 1%, Rank <= 50)"
)
Determining the top enriched GO terms for markers that are specifically overexpressed in the different populations.
## marker genes
cm.up <- scran::findMarkers(sc3,
group = sc3$label2, direction = "up")
mks.up <- extract_markers(sc3,
marker_search_result = cm.up, FDR_thresh = 0.01,
rank_thresh = 300)
#mks.up <- mks.up[ !grepl("^Rp[ls]", gene_symbol)]
## get ENTREZ IDs
all.entrez <- clusterProfiler::bitr(rownames(sc3),
fromType="SYMBOL", toType="ENTREZID",
OrgDb="org.Mm.eg.db") %>% as.data.table
#all.entrez_noRibo <- all.entrez[!grepl("^Rp[ls]", SYMBOL)]
setnames(all.entrez, names(all.entrez), c("gene_symbol", "entrez"))
## generate list of ENTREZ IDs of interest
eg <- all.entrez[mks.up, on = "gene_symbol"]
## focus on those that are UP and UNIQUELY so per population
clstcomp.list <- lapply(names(cm.up), function(x) eg[up_in == x & classify == "unique"]$entrez )
names(clstcomp.list) <- names(cm.up)
cc.gobp <- compareCluster(clstcomp.list, fun = "enrichGO", OrgDb = org.Mm.eg.db,
ont = "BP", pvalueCutoff=0.05, readable=TRUE, universe = all.entrez$entrez)
## visualization
dotplot(cc.gobp, showCategory = 10) +
ggtitle("Overrepresented GO BP terms for markers that are\nuniquely upregulated in either population") +
scale_color_gradientn(colours =rev(c("azure2", "darkseagreen1", "seagreen2","forestgreen")), limits = c(0, 0.05))
Here are some of the top enriched GO terms in detail:
cnetplot(cc.gobp) + scale_fill_manual(values = popcols, name = "")
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] dendextend_1.15.1 edgeR_3.30.3
## [3] limma_3.44.3 BiocFileCache_1.12.1
## [5] dbplyr_2.0.0 clusterProfiler_3.16.1
## [7] pheatmap_1.0.12 org.Mm.eg.db_3.11.4
## [9] AnnotationDbi_1.50.3 pcaExplorer_2.14.2
## [11] SingleCellExperiment_1.10.1 SummarizedExperiment_1.18.2
## [13] DelayedArray_0.14.1 matrixStats_0.57.0
## [15] Biobase_2.48.0 GenomicRanges_1.40.0
## [17] GenomeInfoDb_1.24.2 IRanges_2.22.2
## [19] S4Vectors_0.26.1 BiocGenerics_0.34.0
## [21] patchwork_1.1.1 ggplot2_3.3.3
## [23] magrittr_2.0.1 data.table_1.13.6
##
## loaded via a namespace (and not attached):
## [1] shinydashboard_0.7.1 tidyselect_1.1.1
## [3] heatmaply_1.2.1 RSQLite_2.2.2
## [5] htmlwidgets_1.5.3 grid_4.0.2
## [7] TSP_1.1-10 BiocParallel_1.22.0
## [9] scatterpie_0.1.5 munsell_0.5.0
## [11] codetools_0.2-18 statmod_1.4.35
## [13] scran_1.16.0 DT_0.17
## [15] withr_2.3.0 colorspace_2.0-0
## [17] GOSemSim_2.14.2 Category_2.54.0
## [19] knitr_1.30 DOSE_3.14.0
## [21] NMF_0.23.0 labeling_0.4.2
## [23] urltools_1.7.3 GenomeInfoDbData_1.2.3
## [25] polyclip_1.10-0 topGO_2.40.0
## [27] bit64_4.0.5 farver_2.0.3
## [29] downloader_0.4 vctrs_0.3.8
## [31] generics_0.1.0 xfun_0.20
## [33] R6_2.5.0 doParallel_1.0.16
## [35] ggbeeswarm_0.6.0 graphlayouts_0.7.1
## [37] rsvd_1.0.3 seriation_1.3.0
## [39] locfit_1.5-9.4 bitops_1.0-6
## [41] shinyAce_0.4.1 fgsea_1.14.0
## [43] gridGraphics_0.5-1 assertthat_0.2.1
## [45] promises_1.1.1 scales_1.1.1
## [47] ggraph_2.0.4 enrichplot_1.8.1
## [49] beeswarm_0.2.3 gtable_0.3.0
## [51] drat_0.1.8 tidygraph_1.2.0
## [53] rlang_0.4.11 genefilter_1.70.0
## [55] splines_4.0.2 lazyeval_0.2.2
## [57] shinyBS_0.61 europepmc_0.4
## [59] BiocManager_1.30.10 yaml_2.2.1
## [61] reshape2_1.4.4 threejs_0.3.3
## [63] crosstalk_1.1.1 httpuv_1.5.4
## [65] qvalue_2.20.0 RBGL_1.64.0
## [67] tools_4.0.2 ggplotify_0.0.5
## [69] gridBase_0.4-7 ellipsis_0.3.2
## [71] RColorBrewer_1.1-2 ggridges_0.5.3
## [73] Rcpp_1.0.6 plyr_1.8.6
## [75] base64enc_0.1-3 progress_1.2.2
## [77] zlibbioc_1.34.0 purrr_0.3.4
## [79] RCurl_1.98-1.2 prettyunits_1.1.1
## [81] openssl_1.4.3 viridis_0.5.1
## [83] cowplot_1.1.1 ggrepel_0.9.0
## [85] cluster_2.1.0 DO.db_2.9
## [87] SparseM_1.78 triebeard_0.3.0
## [89] hms_1.0.0 mime_0.9
## [91] evaluate_0.14 xtable_1.8-4
## [93] XML_3.99-0.5 gridExtra_2.3
## [95] compiler_4.0.2 biomaRt_2.44.4
## [97] scater_1.16.2 tibble_3.0.4
## [99] crayon_1.3.4 htmltools_0.5.1.1
## [101] GOstats_2.54.0 later_1.1.0.1
## [103] tidyr_1.1.2 geneplotter_1.66.0
## [105] DBI_1.1.0 tweenr_1.0.1
## [107] MASS_7.3-53 rappdirs_0.3.3
## [109] Matrix_1.3-4 igraph_1.2.6
## [111] pkgconfig_2.0.3 rvcheck_0.1.8
## [113] registry_0.5-1 plotly_4.9.3
## [115] xml2_1.3.2 foreach_1.5.1
## [117] annotate_1.66.0 vipor_0.4.5
## [119] dqrng_0.2.1 rngtools_1.5
## [121] pkgmaker_0.32.2 webshot_0.5.2
## [123] XVector_0.28.0 AnnotationForge_1.30.1
## [125] stringr_1.4.0 digest_0.6.27
## [127] graph_1.68.0 rmarkdown_2.6
## [129] fastmatch_1.1-0 DelayedMatrixStats_1.10.1
## [131] GSEABase_1.52.1 curl_4.3
## [133] shiny_1.5.0 lifecycle_0.2.0
## [135] jsonlite_1.7.2 BiocNeighbors_1.6.0
## [137] viridisLite_0.3.0 askpass_1.1
## [139] pillar_1.4.7 lattice_0.20-41
## [141] fastmap_1.1.0 httr_1.4.2
## [143] survival_3.2-7 GO.db_3.11.4
## [145] glue_1.4.2 iterators_1.0.13
## [147] bit_4.0.4 Rgraphviz_2.32.0
## [149] ggforce_0.3.2 stringi_1.5.3
## [151] blob_1.2.1 DESeq2_1.28.1
## [153] BiocSingular_1.4.0 memoise_1.1.0
## [155] dplyr_1.0.2 irlba_2.3.3